Import libraries

library(data.table)
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.1     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()

Importing

I import the raw file directly from GitHub.

# Read the mall-customers CSV straight from its raw GitHub URL.
customers <- fread(
  input = "https://raw.githubusercontent.com/SteffiPeTaffy/machineLearningAZ/master/Machine%20Learning%20A-Z%20Template%20Folder/Part%204%20-%20Clustering/Section%2025%20-%20Hierarchical%20Clustering/Mall_Customers.csv"
)

Let's explore the data

# Show each column's name, type and first few values.
customers %>% glimpse()
## Observations: 200
## Variables: 5
## $ CustomerID               <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ Genre                    <chr> "Male", "Male", "Female", "Female", "Fe…
## $ Age                      <int> 19, 21, 20, 23, 31, 22, 35, 23, 64, 30,…
## $ `Annual Income (k$)`     <int> 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,…
## $ `Spending Score (1-100)` <int> 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14…
# Per-column summary statistics.
customers %>% summary()
##    CustomerID        Genre                Age        Annual Income (k$)
##  Min.   :  1.00   Length:200         Min.   :18.00   Min.   : 15.00    
##  1st Qu.: 50.75   Class :character   1st Qu.:28.75   1st Qu.: 41.50    
##  Median :100.50   Mode  :character   Median :36.00   Median : 61.50    
##  Mean   :100.50                      Mean   :38.85   Mean   : 60.56    
##  3rd Qu.:150.25                      3rd Qu.:49.00   3rd Qu.: 78.00    
##  Max.   :200.00                      Max.   :70.00   Max.   :137.00    
##  Spending Score (1-100)
##  Min.   : 1.00         
##  1st Qu.:34.75         
##  Median :50.00         
##  Mean   :50.20         
##  3rd Qu.:73.00         
##  Max.   :99.00
# Count the missing (NA) values in each column.
customers %>%
  is.na() %>%
  colSums()
##             CustomerID                  Genre                    Age 
##                      0                      0                      0 
##     Annual Income (k$) Spending Score (1-100) 
##                      0                      0

There are no missing (NA) values. Cool! Let's dive into clustering.

Plot 3D

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Treat gender as a categorical variable.
customers$Genre <- factor(customers$Genre)

# Replace the original headers with short snake_case names (assigned by
# position), then drop the customer identifier column.
names(customers) <- c("id", "genre", "age", "annual_income", "spending_score")
customers <- select(customers, -id)

# Check the resulting column names and types.
str(customers)
## Classes 'data.table' and 'data.frame':   200 obs. of  4 variables:
##  $ genre         : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ age           : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ annual_income : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ spending_score: int  39 81 6 77 40 76 6 94 3 72 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Interactive 3D scatter plot of age, annual income and spending score,
# with markers coloured by gender.
p <- customers %>%
  plot_ly(
    x = ~age,
    y = ~annual_income,
    z = ~spending_score,
    color = ~genre,
    colors = c("#BF382A", "#0C4B8E")
  ) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = "age"),
      yaxis = list(title = "annual_income"),
      zaxis = list(title = "spending_score")
    )
  )

# Render the plot.
p

Implementing K-means

# K-means clustering of the customers on age, annual income and spending score.
# The seed is fixed so the random initial centroids, and therefore the cluster
# labels, are reproducible between runs.
set.seed(20)

# Select the features by name rather than by position (2:4), so the model
# cannot silently pick up the wrong columns if the column order changes.
# NOTE(review): features are clustered on their raw scales; consider scale()
# if one variable should not dominate the distance -- confirm with the analyst.
cluster_features <- customers %>%
  select(age, annual_income, spending_score)

# nstart = 25 runs k-means from 25 random initialisations and keeps the best
# solution; a single start (the default) can settle in a poor local optimum.
clusters <- kmeans(cluster_features, centers = 5, nstart = 25)
customers$cluster <- as.factor(clusters$cluster)

# One distinct colour per cluster. A two-colour vector would be interpolated
# into a red-to-blue ramp, making neighbouring clusters hard to tell apart.
cluster_colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00")

# Same 3D scatter plot as before, now coloured by assigned cluster.
p2 <- customers %>%
  plot_ly(
    x = ~age,
    y = ~annual_income,
    z = ~spending_score,
    color = ~cluster,
    colors = cluster_colors
  ) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = "age"),
      yaxis = list(title = "annual_income"),
      zaxis = list(title = "spending_score")
    )
  )

# Render the plot.
p2

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.